{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 333,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from dolma.taggers.url import BaseDomainTagger\n",
    "from dolma.core.paths import mkdir_p, parent, cached_path\n",
    "from typing import NamedTuple, Set\n",
    "import csv\n",
    "from collections import defaultdict, Counter\n",
    "import smart_open"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 334,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Matched(NamedTuple):\n",
    "    \"\"\"A report domain for which an entry exists in the Wikidata URL lookup.\"\"\"\n",
    "\n",
    "    # Name of the report the domain was read from.\n",
    "    source: str\n",
    "    # The domain exactly as it appears in the report.\n",
    "    url: str\n",
    "    # The count paired with the domain in the report.\n",
    "    count: int\n",
    "    # Wikidata info for the matched URL; read via the \"label\" and \"description\" keys.\n",
    "    wiki: dict\n",
    "\n",
    "class AggItem(NamedTuple):\n",
    "    \"\"\"Running aggregate for one URL: the reports that listed it and its largest count.\"\"\"\n",
    "\n",
    "    sources: Set[str]\n",
    "    count: int\n",
    "\n",
    "    def update(self, source: str, count: int) -> 'AggItem':\n",
    "        \"\"\"Return a new AggItem that also contains `source` and keeps the larger count.\n",
    "\n",
    "        The instance this is called on is not modified.\n",
    "        \"\"\"\n",
    "        merged_sources = self.sources.union({source})\n",
    "        largest_count = max(self.count, count)\n",
    "        return AggItem(merged_sources, largest_count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 335,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input locations on the local machine (absolute paths; adjust for your environment).\n",
    "url_reports = \"/Users/lucas/Documents/v1_6url-reports/v1_6-reports\"\n",
    "wikidata = \"/Users/lucas/Documents/wikidata/wikidata-20220208\"\n",
    "\n",
    "# Blocklist reports this notebook has been pointed at. Only `report_name` is used by the\n",
    "# rest of the notebook; switch reports by changing that single assignment.\n",
    "known_report_names = (\n",
    "    \"domain_blocklist_utp_v1\",\n",
    "    \"blocklist_hosts_adware_malware_v1\",\n",
    "    \"blocklist_hosts_social_v1\",\n",
    "    \"blocklist_hosts_porn_v1\",\n",
    "    \"blocklist_hosts_gambling_v1\",\n",
    "    \"blocklist_hosts_fakenews_v1\",\n",
    ")\n",
    "report_name = \"blocklist_hosts_fakenews_v1\"\n",
    "\n",
    "# URL of the LDNOOBW English word list.\n",
    "bad_words = \"https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en\"\n",
    "\n",
    "# Output spreadsheets, written to one folder per report.\n",
    "sheet_wiki = f\"/Users/lucas/Documents/url_sheets/{report_name}/wikidata.csv\"\n",
    "sheet_rest = f\"/Users/lucas/Documents/url_sheets/{report_name}/rest.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 336,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parsed JSON reports keyed by file name (including the \".json\" extension). Only files in\n",
    "# `url_reports` whose name contains `report_name` are loaded.\n",
    "reports = {}\n",
    "# os.listdir() returns entries in arbitrary order; sorting keeps the dict order, and every\n",
    "# tie-break that later depends on it, stable across runs and machines.\n",
    "for report in sorted(os.listdir(url_reports)):\n",
    "    if not report.endswith(\".json\") or report_name not in report:\n",
    "        continue\n",
    "    with open(os.path.join(url_reports, report), encoding=\"utf-8\") as f:\n",
    "        reports[report] = json.load(f)\n",
    "\n",
    "# `bad_words` starts out as the word-list URL and is replaced here by the set of its lines.\n",
    "# Only fetch while it is still a string, so re-running this cell does not hand a set to\n",
    "# cached_path().\n",
    "if isinstance(bad_words, str):\n",
    "    with open(cached_path(bad_words), encoding=\"utf-8\") as f:\n",
    "        bad_words = set(f.read().splitlines())\n",
    "\n",
    "# Lowercase keywords used to filter Wikidata entries by their description text.\n",
    "bad_wiki_words = {\n",
    "    # adult content\n",
    "    \"adult\", \"camming\", \"dating\", \"escort\", \"hentai\", \"nudity\", \"porn\", \"sex\",\n",
    "    # gambling\n",
    "    \"betting\", \"casino\", \"gambling\",\n",
    "    # humor, gossip and celebrity coverage\n",
    "    \"celebrity\", \"comedy\", \"comic\", \"gossip\", \"humor\", \"joke\", \"satire\", \"tabloid\",\n",
    "    # social and image platforms\n",
    "    \"4chan\", \"image\", \"image hosting\", \"imageboard\", \"social media\",\n",
    "    # everything else\n",
    "    \"crowdfunding\", \"freemium\", \"online database\", \"tracker\",\n",
    "}\n",
    "# Descriptions that are too generic to be useful when they make up the whole description.\n",
    "no_description = {\"company\", \"website\"}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
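   "source": [
    "## Build the Wikidata URL lookup\n",
    "\n",
    "Read each concept's `response.json` from the Wikidata directory and map every cleaned URL to its label and description. Entries whose description is empty, too generic, or contains a filtered keyword are skipped."
   ]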
  },
  {
   "cell_type": "code",
   "execution_count": 337,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Maps each cleaned URL found in the Wikidata dump to {\"label\": ..., \"description\": ...}.\n",
    "wikidata_urls = {}\n",
    "# Number of entries dropped because cleaning the URL or reading a required field failed.\n",
    "wikidata_skipped = 0\n",
    "\n",
    "# Sorted so that, when two entries share a URL, the one that wins is the same on every run.\n",
    "for concept in sorted(os.listdir(wikidata)):\n",
    "    concept_dir = os.path.join(wikidata, concept)\n",
    "    if not os.path.isdir(concept_dir):\n",
    "        continue\n",
    "    with open(os.path.join(concept_dir, 'response.json'), encoding=\"utf-8\") as f:\n",
    "        entries = json.load(f)\n",
    "\n",
    "    for data in entries:\n",
    "        # `or \"\"` also covers an explicit null description, which would otherwise break .strip().\n",
    "        raw_description = data.get(\"description\") or \"\"\n",
    "        description = raw_description.strip()\n",
    "        # The keyword sets are lowercase, so compare against a lowercased copy.\n",
    "        lowered = description.lower()\n",
    "        if (\n",
    "            not description\n",
    "            or lowered in no_description\n",
    "            or any(bw in lowered for bw in bad_wiki_words)\n",
    "        ):\n",
    "            continue\n",
    "        try:\n",
    "            for url in BaseDomainTagger.clean_url(data['url']):\n",
    "                wikidata_urls[url] = {\"label\": data[\"itemLabel\"], \"description\": raw_description}\n",
    "        except Exception:\n",
    "            # Best effort: one malformed entry must not abort the scan. Catching Exception\n",
    "            # instead of using a bare except keeps kernel interrupts (Ctrl-C) working.\n",
    "            wikidata_skipped += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 338,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to /Users/lucas/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "[nltk_data] Downloading package wordnet to /Users/lucas/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "\n",
    "# word_tokenize() needs the Punkt tokenizer models in addition to the two corpora used below.\n",
    "# Newer NLTK releases look for \"punkt_tab\", older ones for \"punkt\", so request both.\n",
    "for resource in (\"stopwords\", \"wordnet\", \"punkt\", \"punkt_tab\"):\n",
    "    nltk.download(resource)\n",
    "\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "wnl = WordNetLemmatizer()\n",
    "stop_words = set(stopwords.words('english'))\n",
    "\n",
    "# Frequency of lemmatized, non-stopword, alphabetic tokens across the kept descriptions.\n",
    "coll = Counter()\n",
    "\n",
    "for row in wikidata_urls.values():\n",
    "    if not row[\"description\"]:\n",
    "        continue\n",
    "\n",
    "    # Lowercase first so the stopword check below is case-insensitive.\n",
    "    words = [w.lower() for w in word_tokenize(row[\"description\"]) if w.isalpha()]\n",
    "    words = [wnl.lemmatize(w) for w in words if w not in stop_words]\n",
    "    coll.update(words)\n",
    "\n",
    "# Uncomment to list the most frequent description words:\n",
    "# for k, v in coll.most_common(100):\n",
    "#     print(f\"{v:5d}\\t{k}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Report domains that have an entry in `wikidata_urls`, one Matched per (report, domain).\n",
    "matched = []\n",
    "json_suffix = \".json\"\n",
    "\n",
    "for name, report in reports.items():\n",
    "    # str.strip(\".json\") removes any of the characters '.', 'j', 's', 'o', 'n' from BOTH ends\n",
    "    # of the name, which can eat into the report name itself; cut the extension off explicitly.\n",
    "    source = name[: -len(json_suffix)] if name.endswith(json_suffix) else name\n",
    "\n",
    "    for url, count in report['domains']:\n",
    "        # Naive base-domain guess: the last two dot-separated labels of the host part.\n",
    "        base_url = '.'.join(url.split('/')[0].split('.')[-2:])\n",
    "        base_url_with_www = f'www.{base_url}'\n",
    "\n",
    "        # Prefer the exact URL, then the base domain, then its www. variant.\n",
    "        matched_url = next(\n",
    "            (candidate for candidate in (url, base_url, base_url_with_www) if candidate in wikidata_urls),\n",
    "            None,\n",
    "        )\n",
    "        if matched_url is None:\n",
    "            continue\n",
    "\n",
    "        matched.append(Matched(source, url, count, wikidata_urls[matched_url]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# URLs already written to the Wikidata sheet.\n",
    "seen = set()\n",
    "\n",
    "mkdir_p(parent(sheet_wiki))\n",
    "# newline=\"\" is what the csv module requires of the file it writes to (avoids doubled line\n",
    "# endings on some platforms); utf-8 because labels and descriptions may be non-ASCII.\n",
    "with smart_open.open(sheet_wiki, 'wt', encoding=\"utf-8\", newline=\"\") as f:\n",
    "    # \"notes\" is never set below, so DictWriter fills it with its default empty string.\n",
    "    writer = csv.DictWriter(f, fieldnames=[\"url\", \"count\", \"allow?\", \"label\", \"source\", \"description\", \"notes\"])\n",
    "    writer.writeheader()\n",
    "    # Highest counts first, so the single row kept per URL is the one with its largest count.\n",
    "    for match in sorted(matched, key=lambda x: x.count, reverse=True):\n",
    "        if match.url in seen:\n",
    "            continue\n",
    "        seen.add(match.url)\n",
    "        writer.writerow({\n",
    "            \"source\": match.source,\n",
    "            \"url\": match.url,\n",
    "            \"count\": match.count,\n",
    "            \"allow?\": \"\",\n",
    "            \"label\": (match.wiki['label'] or \"  \"),\n",
    "            \"description\": (match.wiki['description'] or \"  \")\n",
    "        })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# URLs whose largest count is below this threshold are left out of the sheet.\n",
    "min_count = 500\n",
    "\n",
    "# Per-URL aggregate over all reports, for URLs that are not in `seen`.\n",
    "agg = defaultdict(lambda: AggItem(set(), 0))\n",
    "\n",
    "for name, report in reports.items():\n",
    "    for url, count in report[\"domains\"]:\n",
    "        if url not in seen:\n",
    "            agg[url] = agg[url].update(name, count)\n",
    "\n",
    "# Create the output folder here as well so this cell does not rely on another one having done it.\n",
    "mkdir_p(parent(sheet_rest))\n",
    "# newline=\"\" is what the csv module requires of the file it writes to; utf-8 is set explicitly\n",
    "# so the output does not depend on the platform default encoding.\n",
    "with smart_open.open(sheet_rest, 'wt', encoding=\"utf-8\", newline=\"\") as f:\n",
    "    writer = csv.DictWriter(f, fieldnames=[\"url\", \"count\", \"allow?\", \"sources\"])\n",
    "    writer.writeheader()\n",
    "    for url, agg_item in sorted(agg.items(), key=lambda x: x[1].count, reverse=True):\n",
    "        if agg_item.count < min_count:\n",
    "            # Rows are sorted by descending count, so every remaining row is below the threshold too.\n",
    "            break\n",
    "        writer.writerow({\n",
    "            \"url\": url,\n",
    "            \"allow?\": \"\",\n",
    "            # Sets iterate in an unspecified order; sort so the column is identical on every run.\n",
    "            \"sources\": \",\".join(sorted(agg_item.sources)),\n",
    "            \"count\": agg_item.count\n",
    "        })"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dolma",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
